1 WorkFlow

  1. Exploratory Analysis:
  1. ts-plot : overview whole data (trend or outliers)
  2. box-plot by month : monthly effect
  1. De-Trend :
  1. Determinist Trend(linear model)
  2. Stochastic Trend (Difference)
  1. De-Seasonal :
  1. Determinist Seasonal(season as dummy variable)
  2. Stochastic seasonal(Difference with lag)
  1. Fit Model ARMA(p, q) to residuals

  2. Diagnose model(last residuals should be White Noise)

  3. Simulate prediction

2 Exploratory Analysis

2.1 Overview Potential Data

DT::datatable(potential_data)
summary(potential_data)
##      Total             A1              A2       
##  Min.   : 3086   Min.   : 81.0   Min.   : 2888  
##  1st Qu.:11692   1st Qu.:146.2   1st Qu.:11495  
##  Median :15630   Median :173.5   Median :15462  
##  Mean   :16748   Mean   :185.1   Mean   :16563  
##  3rd Qu.:23508   3rd Qu.:223.0   3rd Qu.:23365  
##  Max.   :29869   Max.   :339.0   Max.   :29696
potential_data_ts = ts(potential_data,
                       start = c(2000,1),
                       freq = 12)
plot(potential_data_ts, main = 'ts-plot')

boxplot(potential_data_ts)

Due to extremely low numbers of A1 class and the similarity between Total and A2 class, let’s focus on A2 class as our research target.

2.2 Target Data Summary

target_data_ts = ts(target_data,start = c(2000,1),freq = 12)


boxplot(target_data_ts ~cycle(target_data_ts ),
        main = target_column , 
        xlab = 'month',
        ylab = target_column )

2.3 Training Data

len = length(target_data)
data = target_data[1:(len-forecast_num)]
data_ts = ts(data,start = c(2000,1),freq = 12)

test = target_data[(len-forecast_num+1):len]
test_ts = ts(test,end = c(2019,2),freq = 12)

3 Determinist Method

3.1 Determinist-trend

3.1.1 Linear Model with intercept

model_lm = lm(data_ts~time(data_ts))
summary(model_lm)
## 
## Call:
## lm(formula = data_ts ~ time(data_ts))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4578.8  -989.7   -69.1   971.6  5865.3 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -2.626e+06  4.229e+04  -62.10   <2e-16 ***
## time(data_ts)  1.315e+03  2.105e+01   62.48   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1608 on 214 degrees of freedom
## Multiple R-squared:  0.948,  Adjusted R-squared:  0.9478 
## F-statistic:  3904 on 1 and 214 DF,  p-value: < 2.2e-16
plot(data_ts, main = 'Linear model with intercept');abline(reg = model_lm,col = 'red')

3.1.2 Residual Analysis

## original : for follow-up analysis
res_lm = residuals(model_lm)
plot(y=res_lm ,x=as.vector(time(data_ts)),
xlab='Time',ylab='Residuals',type='o',
main = 'Residuals : LM de-trend')

## standard : observe outliers (some issues sor events)
std_res_lm = ts(rstudent(model_lm),
                start = c(2000,1),freq = 12)
plot(y=std_res_lm ,x=as.vector(time(std_res_lm)),
xlab='Time',ylab='Standardized Residuals',type='o',
main = 'Satndard residuals : LM de-trend');abline(a = 3,b = 0, col = 'red', lty = 2);abline(a = -3,b = 0, col = 'red', lty = 2);abline(a = 2,b = 0, col = 'orange', lty = 2);abline(a = -2,b = 0, col = 'orange', lty = 2);abline(a = 0,b = 0, col = 'grey', lty = 2)

hist(std_res_lm ,
     breaks = 30,
     xlab='Standardized Residuals', 
     main = 'Residuals Histogram')

qqnorm(std_res_lm , main = 'QQ plot');qqline(std_res_lm , col = 'red')

ks.test(std_res_lm ,"pnorm")
## 
##  One-sample Kolmogorov-Smirnov test
## 
## data:  std_res_lm
## D = 0.034903, p-value = 0.955
## alternative hypothesis: two-sided
acf(res_lm, main = 'ACF of Residuals', lag.max = 60)

After de-trend process, residuals are considered to be normal distribution , but not independent random variable. Therefore, it’s clearly proper to treat the data as a time series.

3.1.3 Linear Model w.o. intercept

model_lm_woi = lm(data_ts~time(data_ts)-1)
summary(model_lm_woi)
## 
## Call:
## lm(formula = data_ts ~ time(data_ts) - 1)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -13052  -4927  -1406   6524  13637 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## time(data_ts)   7.9699     0.2369   33.64   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6995 on 215 degrees of freedom
## Multiple R-squared:  0.8403, Adjusted R-squared:  0.8396 
## F-statistic:  1132 on 1 and 215 DF,  p-value: < 2.2e-16
plot(data_ts, main='Linear Model w.o. intercept');abline(reg = model_lm_woi, col = 'red')

plot(data_ts, main='Linear Model w.o. intercept (start at (0,0))',
     xlim = c(0,2018), ylim = c(0,26000));abline(reg = model_lm_woi, col = 'red')

According to the plot above, the linear model without intercept may not proper to fit this data due to the restriction of passing through origin coordinates.

3.2 Determinist-season

Attention : We continue the seasonal analysis with residual data from de-trend process instead of original data.

3.2.1 Summary

season_data = res_lm

season_data_ts = ts(season_data,start = c(2000,1),freq = 12)

plot(y=season_data_ts  ,x=as.vector(time(season_data_ts )),
xlab='Time',ylab='Residuals',type='o',
main = paste0('Residuals of ',target_column,' after de-trend'))

boxplot(season_data_ts~cycle(season_data_ts),
        main = paste0('Residual of ',
                      target_column , 
                      ' after de-trend'), 
        xlab = 'month',
        ylab = 'Residual')

It’s obviously that the occurrence of motor vehicle accident in February is lower than other months. Since means of twelves months are not equal,the seasonal effect should’nt be ignored.

3.2.2 Seasonal Model w.o intercept

mon=season(season_data_ts) 

model_season = lm(season_data_ts~mon-1)

summary(model_season)
## 
## Call:
## lm(formula = season_data_ts ~ mon - 1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4548.1  -976.3    73.0   927.8  4433.9 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## monJanuary    1086.50     331.48   3.278  0.00123 ** 
## monFebruary  -1879.83     331.48  -5.671 4.81e-08 ***
## monMarch       -78.44     331.48  -0.237  0.81318    
## monApril      -691.49     331.48  -2.086  0.03821 *  
## monMay         206.12     331.48   0.622  0.53476    
## monJune        -46.82     331.48  -0.141  0.88780    
## monJuly        232.18     331.48   0.700  0.48446    
## monAugust     -520.93     331.48  -1.572  0.11761    
## monSeptember  -576.88     331.48  -1.740  0.08331 .  
## monOctober     498.68     331.48   1.504  0.13402    
## monNovember    339.51     331.48   1.024  0.30693    
## monDecember   1431.41     331.48   4.318 2.45e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1406 on 204 degrees of freedom
## Multiple R-squared:  0.2706, Adjusted R-squared:  0.2277 
## F-statistic: 6.306 on 12 and 204 DF,  p-value: 1.87e-09
model_season_fit = ts(fitted(model_season),start = c(2000,1),freq = 12)

plot(model_season_fit , 
     main = 'Seasonal model w.o. intercept',
     col = 'red',
     ylim = c(min(season_data_ts),max(season_data_ts)));points(season_data_ts, col = 'black')

## residual
res_season = residuals(model_season)
res_season_ts = ts(res_season, 
                      start = c(2000,1),
                      freq = 12)


plot(y=res_season_ts ,x=as.vector(time(res_season_ts)),
xlab='Time',ylab='Residuals',type='o',
main = 'Residuals : de-trend & de-season')

### standard residual
std_res_season = rstudent(model_season)

plot(y=std_res_season  ,x=as.vector(time(std_res_season)),
xlab='Time',ylab='Standardized Residuals',type='o',
main = 'Satndard Residuals : de-trend & de-season');abline(a = 3,b = 0, col = 'red', lty = 2);abline(a = -3,b = 0, col = 'red', lty = 2);abline(a = 2,b = 0, col = 'orange', lty = 2);abline(a = -2,b = 0, col = 'orange', lty = 2);abline(a = 0,b = 0, col = 'grey', lty = 2)

hist(std_res_season ,
     breaks = 30,
     xlab='Standardized Residuals', 
     main = 'Residuals Histogram')

qqnorm(std_res_season , main = 'QQ plot');qqline(std_res_lm , col = 'red')

ks.test(std_res_season ,"pnorm")
## 
##  One-sample Kolmogorov-Smirnov test
## 
## data:  std_res_season
## D = 0.037843, p-value = 0.9165
## alternative hypothesis: two-sided

According to the report, Jan, Feb, Apr and Dec are significant months in seasonal model; On the other side ,the plot of fitted values showes that the seasonal model still doesn’t fully explain the data.

After de-trend and de-season process, now lets check the acf and pacf and try to fit ARMA model; By the way, interestingly, ks-test still doesn’t reject the hypothesis of normal-distribution assumption.

3.2.3 Seasonal Model with intercept

month_=season(season_data_ts) 

model_season_wi=lm(season_data_ts~month_)

summary(model_season_wi)
## 
## Call:
## lm(formula = season_data_ts ~ month_)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4548.1  -976.3    73.0   927.8  4433.9 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       1086.5      331.5   3.278 0.001230 ** 
## month_February   -2966.3      468.8  -6.328 1.55e-09 ***
## month_March      -1164.9      468.8  -2.485 0.013758 *  
## month_April      -1778.0      468.8  -3.793 0.000196 ***
## month_May         -880.4      468.8  -1.878 0.061804 .  
## month_June       -1133.3      468.8  -2.418 0.016502 *  
## month_July        -854.3      468.8  -1.822 0.069853 .  
## month_August     -1607.4      468.8  -3.429 0.000733 ***
## month_September  -1663.4      468.8  -3.548 0.000481 ***
## month_October     -587.8      468.8  -1.254 0.211301    
## month_November    -747.0      468.8  -1.593 0.112604    
## month_December     344.9      468.8   0.736 0.462732    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1406 on 204 degrees of freedom
## Multiple R-squared:  0.2706, Adjusted R-squared:  0.2312 
## F-statistic: 6.879 on 11 and 204 DF,  p-value: 7.843e-10
model_season_wi_fit = ts(fitted(model_season_wi),start = c(2000,1),freq = 12)

ts.plot(model_season_fit,model_season_wi_fit,col = c(2,4),lwd = c(2,2),lty = c(2,3), main = 'Seasonal Means Model',
     ylim = c(min(season_data_ts),max(season_data_ts)));points(season_data_ts, col = 'black')

Otherwise, it excludes the January effect in seasonal model with intercept, but fitted values are totally the same with ones without intercept; for the sake, the following analysis output will not differ with each seasonal model result.

3.2.4 Alternative : Harnonic Function

har =harmonic(season_data_ts, 6)

model_har = lm(season_data_ts~har)

summary(model_har)
## 
## Call:
## lm(formula = season_data_ts ~ har)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4548.1  -976.3    73.0   927.8  4433.9 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -2.199e-10  9.569e+01   0.000 1.000000    
## harcos(2*pi*t)   2.123e+02  1.353e+02   1.569 0.118303    
## harcos(4*pi*t)   1.764e+02  1.353e+02   1.303 0.193932    
## harcos(6*pi*t)   3.708e+01  1.353e+02   0.274 0.784346    
## harcos(8*pi*t)   2.815e+02  1.353e+02   2.080 0.038782 *  
## harcos(10*pi*t)  1.778e+02  1.353e+02   1.314 0.190326    
## harcos(12*pi*t)  2.015e+02  9.569e+01   2.106 0.036449 *  
## harsin(2*pi*t)  -3.821e+02  1.353e+02  -2.824 0.005220 ** 
## harsin(4*pi*t)  -7.197e+02  1.353e+02  -5.318 2.74e-07 ***
## harsin(6*pi*t)  -2.745e+02  1.353e+02  -2.028 0.043821 *  
## harsin(8*pi*t)  -3.730e+02  1.353e+02  -2.757 0.006372 ** 
## harsin(10*pi*t) -4.875e+02  1.353e+02  -3.602 0.000396 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1406 on 204 degrees of freedom
## Multiple R-squared:  0.2706, Adjusted R-squared:  0.2312 
## F-statistic: 6.879 on 11 and 204 DF,  p-value: 7.843e-10
model_har_fit = ts(fitted(model_har),
                   start = c(2000,1),freq = 12)

ts.plot(season_data_ts, lty = c(1,3), col=c(1,4),
        main = 'Harmonic(6)', 
        ylim = c(min(model_har_fit,season_data_ts),
                max(model_har_fit,season_data_ts)));points(season_data_ts);lines(model_har_fit,col="red")

3.3 ARMA

3.3.1 ACF and PACF

arma_data = res_season
arma_data_ts = ts(arma_data,start = c(2000,1),freq = 12)

plot(y=arma_data_ts  ,x=as.vector(time(arma_data_ts)),
xlab='Time',ylab='Residuals',type='o',
main = paste0('Residuals of ',
              target_column ,
              ' after de-trend and de-season'))

acf(arma_data, lag = 60, main = 'ACF of Residuals after de-trend and de season')

pacf(arma_data, lag = 60, main = 'PACF of Residuals after de-trend and de season')

The tail-off ACF plot with cuts-off PACF plot implies the AR(p) model. Because of cuts-off after lag 1 in PACF plot, we first give AR(1) or SARMA(1,0)(1,0)[6] a try.

3.3.2 AR(1)

3.3.2.1 Model

model_ar1 = Arima(arma_data_ts, order = c(1,0,0))
summary(model_ar1)
## Series: arma_data_ts 
## ARIMA(1,0,0) with non-zero mean 
## 
## Coefficients:
##          ar1      mean
##       0.8025  -96.8391
## s.e.  0.0420  284.3951
## 
## sigma^2 estimated as 707878:  log likelihood=-1760.77
## AIC=3527.53   AICc=3527.64   BIC=3537.66
## 
## Training set error measures:
##                    ME     RMSE      MAE    MPE     MAPE      MASE
## Training set 8.801504 837.4505 649.3579 40.606 230.7969 0.6335582
##                    ACF1
## Training set -0.2102008
model_ar1_fit = fitted(model_ar1)

ts.plot(arma_data_ts, model_ar1_fit,col = c(1,2),lwd = c(2,1),lty = c(2,1), main = 'AR(1) Model',
     ylim = c(min(arma_data_ts),max(arma_data_ts)));points(arma_data_ts, col = 'black')

3.3.2.2 Residuals

## residual
res_ar1  = residuals(model_ar1) ## model_ar1$residuals
res_ar1_ts = ts(res_ar1, start = c(2000,1), freq = 12)


plot(y=res_ar1_ts ,x=as.vector(time(res_ar1_ts)),
xlab='Time',ylab='Residuals',type='o',
main = 'Residuals of AR(1) after de-trend & de-season')

## standard residual
## rstudent not work to Arima object, but I found rstandard's output not equal to rstudent with seasonal model

std_res_ar1 = rstandard(model_ar1) 

plot(y=std_res_ar1  ,x=as.vector(time(std_res_ar1)),
xlab='Time',ylab='Standardized Residuals',type='o',
main = 'Satndard Residuals of AR(1) after de-trend & de-season');abline(a = 3,b = 0, col = 'red', lty = 2);abline(a = -3,b = 0, col = 'red', lty = 2);abline(a = 2,b = 0, col = 'orange', lty = 2);abline(a = -2,b = 0, col = 'orange', lty = 2);abline(a = 0,b = 0, col = 'grey', lty = 2)

hist(std_res_ar1 ,
     breaks = 30,
     xlab='Standardized Residuals', 
     main = 'Residuals Histogram')

qqnorm(std_res_ar1 , main = 'QQ plot');qqline(std_res_ar1 , col = 'red')

ks.test(std_res_ar1 ,"pnorm")
## 
##  One-sample Kolmogorov-Smirnov test
## 
## data:  std_res_ar1
## D = 0.043488, p-value = 0.8086
## alternative hypothesis: two-sided

3.3.2.3 White Noise Diagnose

acf(as.vector(res_ar1), lag = 60, main = 'ACF : Residuals of AR(1)')

pacf(as.vector(res_ar1), lag = 60, main = 'PACF : Residuals of AR(1)')

3.3.3 SARIMA(1,0,0)(1,0,0)[6]

3.3.3.1 Model

model_ar1010 = Arima(arma_data_ts, 
                  order = c(1,0,0),
                  seasonal = list(order = c(1,0,0),
                                  period = 6))
summary(model_ar1010)
## Series: arma_data_ts 
## ARIMA(1,0,0)(1,0,0)[6] with non-zero mean 
## 
## Coefficients:
##          ar1    sar1       mean
##       0.7119  0.3859  -132.1882
## s.e.  0.0540  0.0729   294.3333
## 
## sigma^2 estimated as 625227:  log likelihood=-1747.22
## AIC=3502.45   AICc=3502.63   BIC=3515.95
## 
## Training set error measures:
##                    ME     RMSE      MAE     MPE     MAPE      MASE
## Training set 9.343143 785.2024 606.2786 52.1924 211.0902 0.5915271
##                   ACF1
## Training set -0.203691
model_ar1010_fit = fitted(model_ar1010)

ts.plot(arma_data_ts, model_ar1010_fit,col = c(1,2),lwd = c(2,1),lty = c(2,1), main = 'SARIMA(1,0,0)(1,0,0)[6] Model',
     ylim = c(min(arma_data_ts),max(arma_data_ts)));points(arma_data_ts, col = 'black')

3.3.3.2 Residuals

## residual
res_ar1010  = residuals(model_ar1010) ## model_ar1$residuals
res_ar1010_ts = ts(res_ar1010, start = c(2000,1), freq = 12)


plot(y=res_ar1010_ts ,x=as.vector(time(res_ar1010_ts)),
xlab='Time',ylab='Residuals',type='o',
main = 'Residuals of SARIMA(1,0,0)(1,0,0)[6]')

## standard residual
## rstudent not work to Arima object, but I found rstandard's output not equal to rstudent with seasonal model

std_res_ar1010 = rstandard(model_ar1010) 

plot(y=std_res_ar1010  ,x=as.vector(time(std_res_ar1010)),
xlab='Time',ylab='Standardized Residuals',type='o',
main = 'Satndard Residuals of SARIMA(1,0,0)(1,0,0)[6]');abline(a = 3,b = 0, col = 'red', lty = 2);abline(a = -3,b = 0, col = 'red', lty = 2);abline(a = 2,b = 0, col = 'orange', lty = 2);abline(a = -2,b = 0, col = 'orange', lty = 2);abline(a = 0,b = 0, col = 'grey', lty = 2)

hist(std_res_ar1010 ,
     breaks = 30,
     xlab='Standardized Residuals', 
     main = 'Residuals Histogram')

qqnorm(std_res_ar1010 , main = 'QQ plot');qqline(std_res_ar1010 , col = 'red')

ks.test(std_res_ar1010 ,"pnorm")
## 
##  One-sample Kolmogorov-Smirnov test
## 
## data:  std_res_ar1010
## D = 0.052486, p-value = 0.5913
## alternative hypothesis: two-sided

3.3.3.3 White Noise Diagnose

acf(as.vector(res_ar1010), lag = 60, main = 'ACF : Residuals of SARIMA(1,0,0)(1,0,0)[6]')

pacf(as.vector(res_ar1010), lag = 60, main = 'PACF : Residuals of SARIMA(1,0,0)(1,0,0)[6]')

3.3.4 Potential parameters for ARMA(p,q)

3.3.4.1 Method I : EACF

eacf(arma_data)
## AR/MA
##   0 1 2 3 4 5 6 7 8 9 10 11 12 13
## 0 x x x x x x x x x x x  x  x  x 
## 1 x o o x o x o o o x o  x  o  o 
## 2 x o o x o x o o o x o  x  o  o 
## 3 o o o x o x o o o o o  x  o  o 
## 4 o x x x o x o o o o o  x  o  o 
## 5 x x x x x o x o o o o  x  o  o 
## 6 x x x o x x o o o o o  o  o  o 
## 7 x x x o o x o o o o o  o  o  o
# eacf_info = eacf(arma_data)
# eacf_sig = eacf_info$eacf
# eacf_sig < 0.05

By EACF matrix, ARMA(1,1) is our candidate for modeling.

3.3.4.2 Method II : auto.arima

Attention: Because we early import deterministic trend and seasonal model; to prevent it from stochastic effect, we set zero to parameters d, D, max.P and max.Q.

auto_arima = auto.arima(arma_data_ts,
           d = 0, D = 0,
           trace = T)
## 
##  Fitting models using approximations to speed things up...
## 
##  ARIMA(2,0,2)(1,0,1)[12] with non-zero mean : 3471.228
##  ARIMA(0,0,0)            with non-zero mean : 3736.151
##  ARIMA(1,0,0)(1,0,0)[12] with non-zero mean : 3500.003
##  ARIMA(0,0,1)(0,0,1)[12] with non-zero mean : 3593.304
##  ARIMA(0,0,0)            with zero mean     : 3734.114
##  ARIMA(2,0,2)(0,0,1)[12] with non-zero mean : 3482.919
##  ARIMA(2,0,2)(1,0,0)[12] with non-zero mean : 3477.057
##  ARIMA(2,0,2)(2,0,1)[12] with non-zero mean : 3481.449
##  ARIMA(2,0,2)(1,0,2)[12] with non-zero mean : 3473.296
##  ARIMA(2,0,2)            with non-zero mean : 3506.728
##  ARIMA(2,0,2)(0,0,2)[12] with non-zero mean : 3479.29
##  ARIMA(2,0,2)(2,0,0)[12] with non-zero mean : 3480.486
##  ARIMA(2,0,2)(2,0,2)[12] with non-zero mean : 3483.594
##  ARIMA(1,0,2)(1,0,1)[12] with non-zero mean : 3469.97
##  ARIMA(1,0,2)(0,0,1)[12] with non-zero mean : 3485.609
##  ARIMA(1,0,2)(1,0,0)[12] with non-zero mean : 3476.579
##  ARIMA(1,0,2)(2,0,1)[12] with non-zero mean : 3479.809
##  ARIMA(1,0,2)(1,0,2)[12] with non-zero mean : 3472.011
##  ARIMA(1,0,2)            with non-zero mean : 3510.372
##  ARIMA(1,0,2)(0,0,2)[12] with non-zero mean : 3480.108
##  ARIMA(1,0,2)(2,0,0)[12] with non-zero mean : 3479.762
##  ARIMA(1,0,2)(2,0,2)[12] with non-zero mean : 3481.963
##  ARIMA(0,0,2)(1,0,1)[12] with non-zero mean : 3545.425
##  ARIMA(1,0,1)(1,0,1)[12] with non-zero mean : 3467.862
##  ARIMA(1,0,1)(0,0,1)[12] with non-zero mean : 3483.676
##  ARIMA(1,0,1)(1,0,0)[12] with non-zero mean : 3474.48
##  ARIMA(1,0,1)(2,0,1)[12] with non-zero mean : 3477.662
##  ARIMA(1,0,1)(1,0,2)[12] with non-zero mean : 3469.874
##  ARIMA(1,0,1)            with non-zero mean : 3510.482
##  ARIMA(1,0,1)(0,0,2)[12] with non-zero mean : 3477.973
##  ARIMA(1,0,1)(2,0,0)[12] with non-zero mean : 3477.657
##  ARIMA(1,0,1)(2,0,2)[12] with non-zero mean : 3479.794
##  ARIMA(0,0,1)(1,0,1)[12] with non-zero mean : 3594.15
##  ARIMA(1,0,0)(1,0,1)[12] with non-zero mean : 3493.262
##  ARIMA(2,0,1)(1,0,1)[12] with non-zero mean : 3469.25
##  ARIMA(0,0,0)(1,0,1)[12] with non-zero mean : 3666.319
##  ARIMA(2,0,0)(1,0,1)[12] with non-zero mean : 3469.661
##  ARIMA(1,0,1)(1,0,1)[12] with zero mean     : 3466.238
##  ARIMA(1,0,1)(0,0,1)[12] with zero mean     : 3481.773
##  ARIMA(1,0,1)(1,0,0)[12] with zero mean     : 3472.697
##  ARIMA(1,0,1)(2,0,1)[12] with zero mean     : 3475.901
##  ARIMA(1,0,1)(1,0,2)[12] with zero mean     : 3468.242
##  ARIMA(1,0,1)            with zero mean     : 3508.494
##  ARIMA(1,0,1)(0,0,2)[12] with zero mean     : 3476.181
##  ARIMA(1,0,1)(2,0,0)[12] with zero mean     : 3475.781
##  ARIMA(1,0,1)(2,0,2)[12] with zero mean     : 3478.018
##  ARIMA(0,0,1)(1,0,1)[12] with zero mean     : 3592.363
##  ARIMA(1,0,0)(1,0,1)[12] with zero mean     : 3491.253
##  ARIMA(2,0,1)(1,0,1)[12] with zero mean     : 3467.794
##  ARIMA(1,0,2)(1,0,1)[12] with zero mean     : 3468.353
##  ARIMA(0,0,0)(1,0,1)[12] with zero mean     : 3664.864
##  ARIMA(0,0,2)(1,0,1)[12] with zero mean     : 3543.6
##  ARIMA(2,0,0)(1,0,1)[12] with zero mean     : 3467.91
##  ARIMA(2,0,2)(1,0,1)[12] with zero mean     : 3469.932
## 
##  Now re-fitting the best model(s) without approximations...
## 
##  ARIMA(1,0,1)(1,0,1)[12] with zero mean     : 3471.867
## 
##  Best model: ARIMA(1,0,1)(1,0,1)[12] with zero mean
auto_parameters = auto_arima$arma #(p,q,P,Q,s,d,D)

By auto.arima function in “forecast” package, the best model is SARIMA(1,0,1)(1,0,1)[12] with zero mean

So, we have three candidate: a. ARMA(1,1) b. SARMA(1,0,1)(1,0,1)[12]

3.3.5 Candidate ARMA(p,q)

## ARMA(1,1)
model_arma11 = Arima(arma_data_ts, order = c(1,0,1))
model_arma11_fit = fitted(model_arma11)
res_arma11 = residuals(model_arma11)
std_res_arma11 = rstandard(model_arma11) 

## auto arima
auto_p = auto_parameters[1]
auto_q = auto_parameters[2]
auto_P = auto_parameters[3]
auto_Q = auto_parameters[4]
auto_s = auto_parameters[5]
auto_d = auto_parameters[6]
auto_D = auto_parameters[7]

model_arma_auto = Arima(arma_data_ts, 
                        order = c(auto_p,auto_d,auto_q),
                        seasonal = list(
                          order = c(auto_P,auto_D,auto_Q),
                          period = auto_s)
                        )
model_arma_auto_fit = fitted(model_arma_auto)
res_arma_auto = residuals(model_arma_auto)
std_res_arma_auto = rstandard(model_arma_auto) 

model_form = paste0("ARIMA(",
                    auto_p,",",
                    auto_d,",",
                    auto_q,")(",
                    auto_P,",",
                    auto_D,",",
                    auto_Q,")[",
                    auto_s,"]")
## plot
ts.plot(arma_data_ts, 
        model_arma11_fit,
        model_arma_auto_fit,
        col = c(1,2,4),
        lwd = c(1,2,2),
        lty = c(1,2,3), 
        
        main = paste0('ARMA(1,1) and  ',model_form,'Model'),
        ylim = c(min(arma_data_ts),
              max(arma_data_ts)))

# ;points(
#         arma_data_ts, col = 'black');legend(
#         'top', 
#         legend = c('data','arma11',
#                    paste0(model_form)),
#         col = c(1,2,4),lty = 1,cex = 0.5
#                       )

3.4 Diagnose

3.4.1 Outlier

plot(y=std_res_arma11  ,
     x=as.vector(time(std_res_arma11)),
     xlab='Time',
     ylab='Standardized Residuals',
     type='o',
     main = 'Satndard Residuals of ARMA(1,1) after de-trend & de-season');abline(a = 3,b = 0, 
     col = 'red', lty = 2);abline(a = -3,b = 0, 
     col = 'red', lty = 2);abline(a = 2,b = 0, 
     col = 'orange', lty = 2);abline(a = -2,b = 0, 
     col = 'orange', lty = 2);abline(a = 0,b = 0, 
     col = 'grey', lty = 2)

plot(y=std_res_arma_auto,
     x=as.vector(time(std_res_arma_auto)),
     ylim = c(-3.5, 3.5),
     xlab='Time',
     ylab='Standardized Residuals',
     type='o',
     main = paste0('Satndard Residuals of ARMA(',
     auto_p,',',auto_q,
     ') after de-trend & de-season'));abline(a = 3,b = 0, 
     col = 'red', lty = 2);abline(a = -3,b = 0, 
     col = 'red', lty = 2);abline(a = 2,b = 0, 
     col = 'orange', lty = 2);abline(a = -2,b = 0, 
     col = 'orange', lty = 2);abline(a = 0,b = 0, 
     col = 'grey', lty = 2)

3.4.2 White Noise

acf(as.vector(res_arma11), lag = 60, main = 'ACF : Residuals of ARMA(1,1)')

pacf(as.vector(res_arma11), lag = 60, main = 'PACF : Residuals of ARMA(1,1)')

acf(as.vector(res_arma_auto), lag = 60, 
    main = paste0('ACF : Residuals of ARMA(',auto_p,',',auto_q,')'))

pacf(as.vector(res_arma_auto), lag = 60, 
     main = paste0('PACF : Residuals of ARMA(',auto_p,',',auto_q,')'))

Consider that the residuals had better be whote noise, neither ARMA(1,1) nor ARMA(5,2) meets the ideal result because of the peaks with lag 12 on ACF plots, which means it’s definitely not independent white noise.Therefore, lets give stochastic method a try.

4 Stochastic Method

4.1 Stochastic-Trend

4.1.1 Dickey Fuller Test

H0: non-stationary with lag k H1: stationary with lag k

## stationary
adf.test(data_ts, alternative = c("stationary"),k = 1)
## Warning in adf.test(data_ts, alternative = c("stationary"), k = 1): p-value
## smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  data_ts
## Dickey-Fuller = -5.156, Lag order = 1, p-value = 0.01
## alternative hypothesis: stationary
adf.test(data_ts, alternative = c("stationary"),k = 2)
## Warning in adf.test(data_ts, alternative = c("stationary"), k = 2): p-value
## smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  data_ts
## Dickey-Fuller = -4.4843, Lag order = 2, p-value = 0.01
## alternative hypothesis: stationary
adf.test(data_ts, alternative = c("stationary"),k = 3)
## Warning in adf.test(data_ts, alternative = c("stationary"), k = 3): p-value
## smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  data_ts
## Dickey-Fuller = -4.0484, Lag order = 3, p-value = 0.01
## alternative hypothesis: stationary
## non stationary
adf.test(data_ts, alternative = c("stationary"),k = 4)
## 
##  Augmented Dickey-Fuller Test
## 
## data:  data_ts
## Dickey-Fuller = -2.3887, Lag order = 4, p-value = 0.4129
## alternative hypothesis: stationary
adf.test(data_ts, alternative = c("stationary"),k = 5)
## 
##  Augmented Dickey-Fuller Test
## 
## data:  data_ts
## Dickey-Fuller = -1.5209, Lag order = 5, p-value = 0.777
## alternative hypothesis: stationary
adf.test(data_ts, alternative = c("stationary"),k = 6)
## 
##  Augmented Dickey-Fuller Test
## 
## data:  data_ts
## Dickey-Fuller = -1.5549, Lag order = 6, p-value = 0.7627
## alternative hypothesis: stationary
adf.test(data_ts, alternative = c("stationary"),k = 12)
## 
##  Augmented Dickey-Fuller Test
## 
## data:  data_ts
## Dickey-Fuller = -0.56348, Lag order = 12, p-value = 0.9782
## alternative hypothesis: stationary

Althogh the above shows outputs of adf.test, I don’t really need it and won’t interpret the test.

4.1.2 ACF before difference

acf(data_ts, lag = 60)

pacf(data_ts, lag = 60)

The tail-off acf, nearly equal to 1.0 starting at lag 1, strongly implies the stochastic trend effect. So I try to import non-seasonal part of the arima model, which means lets take difference with lag 1.

4.1.3 Difference

## method I : arima model with d = 1
model_stoch_trend = Arima(data_ts, order = c(0,1,0))
model_stoch_trend_fit = fitted(model_stoch_trend)

ts.plot(data_ts,model_stoch_trend_fit ,
        col = c(1,2),lwd = c(1,2),lty = c(1,2), 
        main = 'Stochastic Trend Model',
     ylim = c(min(data_ts),max(data_ts)))

res_stoch_trend = residuals(model_stoch_trend)

## method II : diff 
diff(data_ts)==residuals(model_stoch_trend)
##        Jan   Feb   Mar   Apr   May   Jun   Jul   Aug   Sep   Oct   Nov
## 2000       FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## 2001  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## 2002  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## 2003  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## 2004  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## 2005  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## 2006  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## 2007  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## 2008  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## 2009  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## 2010  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## 2011  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## 2012  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## 2013  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## 2014  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## 2015  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## 2016  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## 2017  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##        Dec
## 2000  TRUE
## 2001  TRUE
## 2002  TRUE
## 2003  TRUE
## 2004  TRUE
## 2005  TRUE
## 2006  TRUE
## 2007  TRUE
## 2008  TRUE
## 2009  TRUE
## 2010  TRUE
## 2011  TRUE
## 2012  TRUE
## 2013  TRUE
## 2014  TRUE
## 2015  TRUE
## 2016  TRUE
## 2017  TRUE

4.1.4 ACF after difference

acf(as.vector(res_stoch_trend), lag = 60)

pacf(as.vector(res_stoch_trend), lag = 60)

Compare determinist linear model to stochastic trend model, I think it’s better to choose the latter one because I noticed that acf after difference shows repeatedly high auto-correlation with lag 12. It may help me more to interpret the further stochastic seasonal model.

4.2 Stochastic-Season

stoch_season_data = res_stoch_trend
## method I : arima model with D=1 and period=12
model_stoch_season = Arima(stoch_season_data,
        seasonal = list(order = c(0,1,0), period = 12))

model_stoch_season_fit = fitted(model_stoch_season)

ts.plot(stoch_season_data,
        model_stoch_season_fit ,
        col = c(1,2),lwd = c(1,1),lty = c(1,2), 
        main = 'Stochastic Season Model',
        ylim = c(min(stoch_season_data),
              max(stoch_season_data)))

res_stoch_season = residuals(model_stoch_season)

## method 2 : diff with lag 12
diff(stoch_season_data,lag = 12)==residuals(model_stoch_season)
##        Jan   Feb   Mar   Apr   May   Jun   Jul   Aug   Sep   Oct   Nov
## 2001 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 2002  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## 2003  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## 2004  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## 2005  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## 2006  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## 2007  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## 2008  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## 2009  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## 2010  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## 2011  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## 2012  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## 2013  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## 2014  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## 2015  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## 2016  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## 2017  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##        Dec
## 2001 FALSE
## 2002 FALSE
## 2003  TRUE
## 2004  TRUE
## 2005  TRUE
## 2006  TRUE
## 2007  TRUE
## 2008  TRUE
## 2009  TRUE
## 2010  TRUE
## 2011  TRUE
## 2012  TRUE
## 2013  TRUE
## 2014  TRUE
## 2015  TRUE
## 2016  TRUE
## 2017  TRUE

4.3 ARMA

4.3.1 ACF and PACF

## if you dont import stochastic season effect
# stoch_arma_data = res_stoch_trend 

stoch_arma_data = res_stoch_season
plot(y=stoch_arma_data ,
     x=as.vector(time(stoch_arma_data)),
     xlab='Time',ylab='Residuals',type='o',
main = 'Residuals after de-trend and de-season')

acf(as.vector(stoch_arma_data), lag = 60, 
    main = 'ACF of Residuals after de-trend and de season')

pacf(as.vector(stoch_arma_data), lag = 60, 
     main = 'PACF of Residuals after de-trend and de season')

With low and irregular acf and pacf, it too hard for me to determine ARMA parameters p and q by my poor experience.

4.3.2 Potential parameters for ARMA(p,q)

4.3.2.1 Method I : EACF

It can only partially explain non-seasonal parameters p,q

eacf(stoch_arma_data)
## AR/MA
##   0 1 2 3 4 5 6 7 8 9 10 11 12 13
## 0 x o o x o o x x x o x  x  x  o 
## 1 x x o x o o o x x o o  x  o  o 
## 2 x o x o o o o o o o o  x  o  o 
## 3 x x x o o o o o o o o  x  o  o 
## 4 x x x o x o o o o o o  x  o  o 
## 5 x x o x x o o o o o o  x  o  o 
## 6 x o o x o o o o o o o  x  o  o 
## 7 x o x x o o o o o o o  x  o  o

ARMA(0,1) may be a proper choice

4.4 Method II: auto.arima

auto_arima = auto.arima(stoch_arma_data,
           d = 0, D = 0,
           trace = T)
## 
##  Fitting models using approximations to speed things up...
## 
##  ARIMA(2,0,2)(1,0,1)[12] with non-zero mean : 3488.884
##  ARIMA(0,0,0)            with non-zero mean : 3581.087
##  ARIMA(1,0,0)(1,0,0)[12] with non-zero mean : 3504.55
##  ARIMA(0,0,1)(0,0,1)[12] with non-zero mean : 3467.868
##  ARIMA(0,0,0)            with zero mean     : 3579.179
##  ARIMA(0,0,1)            with non-zero mean : 3522.59
##  ARIMA(0,0,1)(1,0,1)[12] with non-zero mean : 3482.222
##  ARIMA(0,0,1)(0,0,2)[12] with non-zero mean : 3469.939
##  ARIMA(0,0,1)(1,0,0)[12] with non-zero mean : 3500.84
##  ARIMA(0,0,1)(1,0,2)[12] with non-zero mean : Inf
##  ARIMA(0,0,0)(0,0,1)[12] with non-zero mean : 3510.307
##  ARIMA(1,0,1)(0,0,1)[12] with non-zero mean : 3470.948
##  ARIMA(0,0,2)(0,0,1)[12] with non-zero mean : 3469.961
##  ARIMA(1,0,0)(0,0,1)[12] with non-zero mean : 3473.223
##  ARIMA(1,0,2)(0,0,1)[12] with non-zero mean : 3472.868
##  ARIMA(0,0,1)(0,0,1)[12] with zero mean     : 3466.88
##  ARIMA(0,0,1)            with zero mean     : 3520.959
##  ARIMA(0,0,1)(1,0,1)[12] with zero mean     : 3481.284
##  ARIMA(0,0,1)(0,0,2)[12] with zero mean     : 3468.933
##  ARIMA(0,0,1)(1,0,0)[12] with zero mean     : 3499.465
##  ARIMA(0,0,1)(1,0,2)[12] with zero mean     : Inf
##  ARIMA(0,0,0)(0,0,1)[12] with zero mean     : 3508.579
##  ARIMA(1,0,1)(0,0,1)[12] with zero mean     : 3469.923
##  ARIMA(0,0,2)(0,0,1)[12] with zero mean     : 3468.938
##  ARIMA(1,0,0)(0,0,1)[12] with zero mean     : 3471.799
##  ARIMA(1,0,2)(0,0,1)[12] with zero mean     : 3471.794
## 
##  Now re-fitting the best model(s) without approximations...
## 
##  ARIMA(0,0,1)(0,0,1)[12] with zero mean     : 3472.036
## 
##  Best model: ARIMA(0,0,1)(0,0,1)[12] with zero mean
auto_parameters = auto_arima$arma #(p,q,P,Q,s,d,D)
auto_p = auto_parameters[1]
auto_q = auto_parameters[2]
auto_P = auto_parameters[3]
auto_Q = auto_parameters[4]
auto_s = auto_parameters[5]
auto_d = auto_parameters[6]
auto_D = auto_parameters[7]

According to the auto.arima, the best model is SARIMA(p=0,d=0,q=1)(P=0,D=0,Q=1) with lag 12 and without drift.

4.4.1 Candidate SARIMA(p,d,q)(P,D,Q)[s]

model_stoch_arma_auto = Arima(stoch_arma_data, 
                        order = c(auto_p,auto_d,auto_q),
                        seasonal = list(
                          order = c(auto_P,auto_D,auto_Q),
                          period = auto_s)
                        )
model_stoch_arma_auto_fit = (stoch_arma_data - model_stoch_arma_auto$residuals)
#model_stoch_arma_auto_fit == fitted(model_stoch_arma_auto)


res_stoch_arma_auto = residuals(model_stoch_arma_auto)
std_res_stoch_arma_auto = rstandard(model_stoch_arma_auto) 

# res_stoch_arma_auto  == model_stoch_arma_auto$residuals

ts.plot(stoch_arma_data,
        model_stoch_arma_auto_fit ,
        col = c(1,2),lwd = c(1,1),lty = c(1,2), 
        main = 'SARMA Model',
        ylim = c(min(stoch_arma_data),
              max(stoch_arma_data)))

4.5 Diagnose

4.5.1 Outlier

# plot(res_stoch_arma_auto) ## just check
plot(y=std_res_stoch_arma_auto,
     x=as.vector(time(std_res_stoch_arma_auto)),
     xlab='Time',
     ylab='Standardized Residuals',
     type='o',
     main = 'Satndard Residuals of ARMA after de-trend & de-season');abline(a = 3,b = 0, 
     col = 'red', lty = 2);abline(a = -3,b = 0, 
     col = 'red', lty = 2);abline(a = 2,b = 0, 
     col = 'orange', lty = 2);abline(a = -2,b = 0, 
     col = 'orange', lty = 2);abline(a = 0,b = 0, 
     col = 'grey', lty = 2)

4.5.2 White Noise

acf(as.vector(res_stoch_arma_auto), 
    lag = 60, main = 'ACF : Residuals of SARMA')

pacf(as.vector(res_stoch_arma_auto), 
     lag = 60, main = 'PACF : Residuals of SARMA')

Before prediction, let’s summarise the model info. Firstly, we apply linear model or difference into the de-trend process; secondly, we apply seasonal model or difference with a lag into the de-season process.After trend and seasonal effects are removed, we established an ARMA or a Seasonal ARMA model to fit the data and expect that the residuals could be white noise.

Therefore, the whole fitting model of Determinist Method would contain three parts : ARMA, Seasonal Model, Trend Model \[ (1-\Sigma_{i}^{p}{\phi_i B_i})Y_t = (1-\Sigma_{j}^{q}{\theta_j B^j})e_t + (\Sigma{\gamma_k m_k}) + (\beta_1 t + \beta_0)\] \[ B : lag \space operator \] \[p,q : non-seasonal \space ARMA \space parameter\]

\[m_i : dummy \space variable \space of \space seasonal\space effect\]

\[\gamma : coefficient \space of \space season\space effect\]

\[\beta : coefficient \space of \space trend\space effect\]

The Whole Stochastic Model would be like: \[ (1-B)^d(1-B^s)^D(1-\Sigma_{i}^{p}{\phi_i B^i})(1-\Sigma_{j}^{P}{\Phi_j B^{js}})Y_t = (1-\Sigma_{k}^{q}{\theta_k B^k})(1-\Sigma_{l}^{Q}{\theta_l B^{ls}})e_t \]

\[ B : lag \space operator \] \[p,d,q : nonseasonal \space ARMA \space parameter\] \[P,D,Q,s : seasonal \space ARMA \space parameter\]

4.5.3 Simplify Whole Stochastic Model

The whole stochastic model go through once difference for de-trend, once defference with lag 12 for de-season,and fitting a seasonal ARMA(0,0,1)(0,0,1)[12] model.Now we can simplify the whole model as a SARIMA(0,1,1)(0,1,1)[12]

4.5.3.1 Stochastic Method

model_sarima = Arima(y = data_ts,
        order = c(0,1,1),
        seasonal = list(order = c(0,1,1), 
                        period = 12))

model_sarima_fit = (data_ts - model_sarima$residuals)
res_sarima =  model_sarima$residuals

## I don't know why the residual not the same
res_sarima == res_stoch_arma_auto
##        Jan   Feb   Mar   Apr   May   Jun   Jul   Aug   Sep   Oct   Nov
## 2000 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 2001 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 2002 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 2003 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 2004 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 2005 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 2006 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 2007 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 2008 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 2009 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 2010 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 2011 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 2012 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 2013 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 2014 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 2015 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 2016 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 2017 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##        Dec
## 2000 FALSE
## 2001 FALSE
## 2002 FALSE
## 2003 FALSE
## 2004 FALSE
## 2005 FALSE
## 2006 FALSE
## 2007 FALSE
## 2008 FALSE
## 2009 FALSE
## 2010 FALSE
## 2011 FALSE
## 2012 FALSE
## 2013 FALSE
## 2014 FALSE
## 2015 FALSE
## 2016 FALSE
## 2017 FALSE
res_sarima - res_stoch_arma_auto
##              Jan         Feb         Mar         Apr         May
## 2000   -7.388421  -13.379831  -16.754230  -18.460976  -19.258907
## 2001   91.913477   32.066886  -63.148777   -8.237246   19.998097
## 2002  -31.211981  -50.492115 -104.884810  -36.638362   -6.046498
## 2003 -112.714044  -81.221772 -100.937725  -62.623669  -33.354676
## 2004  -99.269968  -52.008779  -85.536683  -52.234578  -33.824281
## 2005  -71.580000  -54.517753  -57.747440  -51.833419  -48.655191
## 2006  -64.095510  -45.304412  -54.931416  -46.106965  -43.176203
## 2007  -56.167327  -48.178956  -56.390167  -48.721637  -55.157659
## 2008  -53.152595  -55.272543  -56.456240  -54.503590  -54.563919
## 2009  -68.732052  -54.532725  -58.345299  -53.764879  -56.640961
## 2010  -57.138055  -65.181011  -59.594780  -68.418156  -62.871350
## 2011  -59.901603  -54.351825  -48.122887  -51.521428  -53.675621
## 2012  -60.158533  -50.373032  -64.422283  -62.102258  -52.441293
## 2013  -60.494723  -64.290823  -51.392051  -55.501599  -47.211391
## 2014  -65.901702  -62.926529  -46.761600  -39.905645  -57.056021
## 2015  -66.751615  -38.811912  -36.728254  -43.943805  -48.706085
## 2016  -60.325330  -50.588472  -57.563262  -55.169705  -47.273278
## 2017  -60.409694  -60.086985  -56.711489  -51.672911  -42.193441
##              Jun         Jul         Aug         Sep         Oct
## 2000  -19.505662  -19.647836  -19.709423  -19.740683  -19.760895
## 2001   32.782089   -4.493858  -28.346272   20.902271   -9.232743
## 2002   12.239330  -42.110521  -54.217819    1.108370  -42.254380
## 2003  -17.047984  -59.135007  -50.539326  -20.192136  -54.261378
## 2004  -33.215939  -54.841724  -42.199865  -31.964886  -57.560147
## 2005  -47.169646  -56.925528  -50.678588  -48.028407  -60.107645
## 2006  -47.436859  -54.063278  -51.318725  -48.673067  -52.119936
## 2007  -53.906868  -56.250546  -57.516933  -45.543389  -55.050814
## 2008  -51.641968  -54.658515  -49.053958  -57.857195  -56.648718
## 2009  -54.194310  -56.274781  -58.521807  -52.684202  -59.216833
## 2010  -58.790996  -57.981531  -61.734323  -67.188984  -59.295454
## 2011  -52.867176  -63.560171  -58.469123  -55.470063  -52.500305
## 2012  -47.421900  -39.877735  -53.105350  -45.998019  -56.422362
## 2013  -58.496624  -68.535778  -59.064140  -65.219589  -57.495593
## 2014  -56.323280  -60.956352  -61.309081  -56.613197  -62.698331
## 2015  -48.815440  -64.212105  -44.022899  -44.226835  -53.144604
## 2016  -58.289453  -50.292390  -50.285565  -45.266509  -46.811745
## 2017  -55.416862  -49.550631  -63.597979  -65.045352  -51.241549
##              Nov         Dec
## 2000  -19.588185  -17.682287
## 2001  -44.370116   15.359414
## 2002  -77.324266    9.635419
## 2003  -67.305151  -17.183525
## 2004  -60.187812  -45.495841
## 2005  -58.136819  -52.109853
## 2006  -57.327953  -50.271398
## 2007  -54.196324  -49.220898
## 2008  -57.828290  -59.019607
## 2009  -57.396712  -58.702715
## 2010  -53.531467  -56.656636
## 2011  -54.211815  -57.555366
## 2012  -72.266387  -62.710972
## 2013  -61.196283  -66.823191
## 2014  -60.070073  -56.267072
## 2015  -57.257195  -61.938140
## 2016  -46.098064  -57.416653
## 2017  -37.091123  -49.235542
## But the acf and pacf pattern are nearly equal
par(mfrow = c(2,2));acf(as.vector(res_sarima), 
    lag = 60, main = 'ACF : Residuals of SARIMA');pacf(as.vector(res_sarima), 
     lag = 60, main = 'PACF : Residuals of SARIMA');acf(as.vector(res_stoch_arma_auto), 
    lag = 60, main = 'ACF : Residuals of whole split models previously');pacf(as.vector(res_stoch_arma_auto), 
     lag = 60, main = 'PACF : Residuals of whole split models previously')

4.5.3.2 Determinist Method

dummy =  seasonaldummy(ts(data_ts,f=12))
var_time = time(data_ts)
xreg = model.matrix(~dummy + var_time)

model_det_arima = Arima(y = data_ts,
                     xreg = xreg ,
                     include.mean=FALSE ,
                     order = c(1,0,1),
                     seasonal=list(order=c(1,0,1), period=12)
                    )

model_det_arima_fit = (data_ts - model_det_arima $residuals)
res_det_arima  =  model_det_arima$residuals

4.5.3.3 Mix Method I

Determinist de-trend + Stochastic de-season + SARIMA

xreg = time(data_ts)

model_mix_arima = Arima(y = data_ts,
                     xreg = xreg,
                     order = c(1,0,1),
                     seasonal=list(order=c(1,1,1), period=12)
                    )

model_mix_arima_fit = (data_ts - model_mix_arima$residuals)
res_mix_arima  =  model_mix_arima$residuals

4.5.3.4 Mix Method II

Stochastic de-trend + Determinist de-season + SARIMA I met some Non-Stationary Problems

# dummy =  seasonaldummy(ts(data_ts,f=12))
# xreg = model.matrix(~dummy)
# 
# model_mix_arima = Arima(y =data_ts,
#                      xreg = xreg,
#                      order = c(1,0,1),
#                      include.mean=FALSE ,
#                      seasonal=list(order=c(1,0,1), 
#                                    period=12)
#                     )
# 
# model_mix_arima_fit = (data_ts - model_mix_arima$residuals)
# res_mix_arima  =  model_mix_arima$residuals

5 Prediction

5.1 Stochastic

pred_sarima = forecast(model_sarima, h = forecast_num)


plot(pred_sarima);lines(pred_sarima$fitted,col="red");lines(target_data_ts,col="black")

5.2 Determinist

test_dummy =  seasonaldummy(ts(test_ts,f=12))
test_var_time = time(test_ts)
test_xreg = model.matrix(~test_dummy + test_var_time)

pred_det_arima = forecast(model_det_arima, 
                          h = forecast_num,
                          xreg = test_xreg)
## Warning in forecast.Arima(model_det_arima, h = forecast_num, xreg =
## test_xreg): xreg contains different column names from the xreg used in
## training. Please check that the regressors are in the same order.
plot(pred_det_arima);lines(pred_det_arima$fitted,col="red");lines(target_data_ts,col="black")

5.3 Mix Model

test_xreg = time(test_ts)

pred_mix_arima = forecast(model_mix_arima, 
                          h = forecast_num,
                          xreg = test_xreg)


par(bg="#FFFFFF");plot(pred_mix_arima, main = 'ARIMA(1,0,1)(1,1,1)[12]' );lines(target_data_ts,col="black");rect(0, 0, 2001, 31000,col = "#f285a233");rect(2001, 0, 2018, 31000,col = "#f2d58533", );rect(2018, 0, 2020, 31000,col = "#85def233", );lines(pred_mix_arima$fitted,col="red");

Thanks for reading !